{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "2d380326-95ec-47d2-9b26-61a70eda1f42",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd  \n",
    "\n",
    "# Read the two Excel workbooks used by the rest of the notebook:\n",
    "#   data1 <- healthcare-dataset-stroke.xlsx\n",
    "#   data2 <- healthcare-dataset-age_abs.xlsx\n",
    "data1 = pd.read_excel(io='./data/healthcare-dataset-stroke.xlsx')\n",
    "data2 = pd.read_excel(io='./data/healthcare-dataset-age_abs.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "d84cbd8c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "表healthcare-dataset-stroke.xlsx数据量：1767\n",
      "表healthcare-dataset-stroke.xlsx数据量：1767\n"
     ]
    }
   ],
   "source": [
    "# (1) Report the number of rows in each of the two tables.\n",
    "# Each message names the workbook its DataFrame was loaded from, so the two\n",
    "# counts can be told apart in the output.\n",
    "print(f\"表healthcare-dataset-stroke.xlsx数据量：{len(data1)}\")\n",
    "print(f\"表healthcare-dataset-age_abs.xlsx数据量：{len(data2)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "60ce06c1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>性别</th>\n",
       "      <th>高血压</th>\n",
       "      <th>是否结婚</th>\n",
       "      <th>工作类型</th>\n",
       "      <th>居住类型</th>\n",
       "      <th>体重指数</th>\n",
       "      <th>吸烟史</th>\n",
       "      <th>中风</th>\n",
       "      <th>年龄</th>\n",
       "      <th>平均血糖</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>84</td>\n",
       "      <td>男</td>\n",
       "      <td>否</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>城市</td>\n",
       "      <td>31.5</td>\n",
       "      <td>从不吸烟</td>\n",
       "      <td>否</td>\n",
       "      <td>55.0</td>\n",
       "      <td>89.17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>99</td>\n",
       "      <td>女</td>\n",
       "      <td>否</td>\n",
       "      <td>否</td>\n",
       "      <td>私人</td>\n",
       "      <td>城市</td>\n",
       "      <td>52.3</td>\n",
       "      <td>未知</td>\n",
       "      <td>否</td>\n",
       "      <td>31.0</td>\n",
       "      <td>108.89</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>129</td>\n",
       "      <td>女</td>\n",
       "      <td>否</td>\n",
       "      <td>否</td>\n",
       "      <td>私人</td>\n",
       "      <td>城市</td>\n",
       "      <td>26.2</td>\n",
       "      <td>从不吸烟</td>\n",
       "      <td>否</td>\n",
       "      <td>24.0</td>\n",
       "      <td>97.55</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>132</td>\n",
       "      <td>女</td>\n",
       "      <td>否</td>\n",
       "      <td>是</td>\n",
       "      <td>政府工作</td>\n",
       "      <td>城市</td>\n",
       "      <td>NaN</td>\n",
       "      <td>未知</td>\n",
       "      <td>否</td>\n",
       "      <td>80.0</td>\n",
       "      <td>84.86</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>156</td>\n",
       "      <td>女</td>\n",
       "      <td>否</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>农村</td>\n",
       "      <td>42.2</td>\n",
       "      <td>从不吸烟</td>\n",
       "      <td>否</td>\n",
       "      <td>33.0</td>\n",
       "      <td>86.97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1762</th>\n",
       "      <td>72836</td>\n",
       "      <td>女</td>\n",
       "      <td>否</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>城市</td>\n",
       "      <td>31.1</td>\n",
       "      <td>未知</td>\n",
       "      <td>否</td>\n",
       "      <td>59.0</td>\n",
       "      <td>65.98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1763</th>\n",
       "      <td>72861</td>\n",
       "      <td>女</td>\n",
       "      <td>否</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>城市</td>\n",
       "      <td>20.1</td>\n",
       "      <td>从不吸烟</td>\n",
       "      <td>否</td>\n",
       "      <td>52.0</td>\n",
       "      <td>69.30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1764</th>\n",
       "      <td>72882</td>\n",
       "      <td>男</td>\n",
       "      <td>否</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>农村</td>\n",
       "      <td>25.0</td>\n",
       "      <td>以前吸烟</td>\n",
       "      <td>否</td>\n",
       "      <td>47.0</td>\n",
       "      <td>75.30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1765</th>\n",
       "      <td>72911</td>\n",
       "      <td>女</td>\n",
       "      <td>是</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>农村</td>\n",
       "      <td>60.9</td>\n",
       "      <td>抽烟</td>\n",
       "      <td>否</td>\n",
       "      <td>57.0</td>\n",
       "      <td>129.54</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1766</th>\n",
       "      <td>72918</td>\n",
       "      <td>女</td>\n",
       "      <td>是</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>城市</td>\n",
       "      <td>30.3</td>\n",
       "      <td>未知</td>\n",
       "      <td>是</td>\n",
       "      <td>53.0</td>\n",
       "      <td>62.55</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1767 rows × 11 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         编号 性别 高血压 是否结婚  工作类型 居住类型  体重指数   吸烟史 中风    年龄    平均血糖\n",
       "0        84  男   否    是    私人   城市  31.5  从不吸烟  否  55.0   89.17\n",
       "1        99  女   否    否    私人   城市  52.3    未知  否  31.0  108.89\n",
       "2       129  女   否    否    私人   城市  26.2  从不吸烟  否  24.0   97.55\n",
       "3       132  女   否    是  政府工作   城市   NaN    未知  否  80.0   84.86\n",
       "4       156  女   否    是    私人   农村  42.2  从不吸烟  否  33.0   86.97\n",
       "...     ... ..  ..  ...   ...  ...   ...   ... ..   ...     ...\n",
       "1762  72836  女   否    是    私人   城市  31.1    未知  否  59.0   65.98\n",
       "1763  72861  女   否    是    私人   城市  20.1  从不吸烟  否  52.0   69.30\n",
       "1764  72882  男   否    是    私人   农村  25.0  以前吸烟  否  47.0   75.30\n",
       "1765  72911  女   是    是    私人   农村  60.9    抽烟  否  57.0  129.54\n",
       "1766  72918  女   是    是    私人   城市  30.3    未知  是  53.0   62.55\n",
       "\n",
       "[1767 rows x 11 columns]"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# (2) Outer-join the two tables, using the '编号' column as the key.\n",
    "#   on='编号'   -> the key column shared by both tables\n",
    "#   how='outer' -> outer join: keep every key found in either table\n",
    "merged_data = data1.merge(data2, on='编号', how='outer')\n",
    "merged_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "f9e3b6dd",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>性别</th>\n",
       "      <th>高血压</th>\n",
       "      <th>是否结婚</th>\n",
       "      <th>工作类型</th>\n",
       "      <th>居住类型</th>\n",
       "      <th>体重指数</th>\n",
       "      <th>吸烟史</th>\n",
       "      <th>中风</th>\n",
       "      <th>年龄</th>\n",
       "      <th>平均血糖</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>84</td>\n",
       "      <td>男</td>\n",
       "      <td>否</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>城市</td>\n",
       "      <td>31.5</td>\n",
       "      <td>从不吸烟</td>\n",
       "      <td>否</td>\n",
       "      <td>55.0</td>\n",
       "      <td>89.17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>99</td>\n",
       "      <td>女</td>\n",
       "      <td>否</td>\n",
       "      <td>否</td>\n",
       "      <td>私人</td>\n",
       "      <td>城市</td>\n",
       "      <td>52.3</td>\n",
       "      <td>未知</td>\n",
       "      <td>否</td>\n",
       "      <td>31.0</td>\n",
       "      <td>108.89</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>129</td>\n",
       "      <td>女</td>\n",
       "      <td>否</td>\n",
       "      <td>否</td>\n",
       "      <td>私人</td>\n",
       "      <td>城市</td>\n",
       "      <td>26.2</td>\n",
       "      <td>从不吸烟</td>\n",
       "      <td>否</td>\n",
       "      <td>24.0</td>\n",
       "      <td>97.55</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>132</td>\n",
       "      <td>女</td>\n",
       "      <td>否</td>\n",
       "      <td>是</td>\n",
       "      <td>政府工作</td>\n",
       "      <td>城市</td>\n",
       "      <td>NaN</td>\n",
       "      <td>未知</td>\n",
       "      <td>否</td>\n",
       "      <td>80.0</td>\n",
       "      <td>84.86</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>156</td>\n",
       "      <td>女</td>\n",
       "      <td>否</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>农村</td>\n",
       "      <td>42.2</td>\n",
       "      <td>从不吸烟</td>\n",
       "      <td>否</td>\n",
       "      <td>33.0</td>\n",
       "      <td>86.97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1762</th>\n",
       "      <td>72836</td>\n",
       "      <td>女</td>\n",
       "      <td>否</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>城市</td>\n",
       "      <td>31.1</td>\n",
       "      <td>未知</td>\n",
       "      <td>否</td>\n",
       "      <td>59.0</td>\n",
       "      <td>65.98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1763</th>\n",
       "      <td>72861</td>\n",
       "      <td>女</td>\n",
       "      <td>否</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>城市</td>\n",
       "      <td>20.1</td>\n",
       "      <td>从不吸烟</td>\n",
       "      <td>否</td>\n",
       "      <td>52.0</td>\n",
       "      <td>69.30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1764</th>\n",
       "      <td>72882</td>\n",
       "      <td>男</td>\n",
       "      <td>否</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>农村</td>\n",
       "      <td>25.0</td>\n",
       "      <td>以前吸烟</td>\n",
       "      <td>否</td>\n",
       "      <td>47.0</td>\n",
       "      <td>75.30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1765</th>\n",
       "      <td>72911</td>\n",
       "      <td>女</td>\n",
       "      <td>是</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>农村</td>\n",
       "      <td>60.9</td>\n",
       "      <td>抽烟</td>\n",
       "      <td>否</td>\n",
       "      <td>57.0</td>\n",
       "      <td>129.54</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1766</th>\n",
       "      <td>72918</td>\n",
       "      <td>女</td>\n",
       "      <td>是</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>城市</td>\n",
       "      <td>30.3</td>\n",
       "      <td>未知</td>\n",
       "      <td>是</td>\n",
       "      <td>53.0</td>\n",
       "      <td>62.55</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1737 rows × 11 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         编号 性别 高血压 是否结婚  工作类型 居住类型  体重指数   吸烟史 中风    年龄    平均血糖\n",
       "0        84  男   否    是    私人   城市  31.5  从不吸烟  否  55.0   89.17\n",
       "1        99  女   否    否    私人   城市  52.3    未知  否  31.0  108.89\n",
       "2       129  女   否    否    私人   城市  26.2  从不吸烟  否  24.0   97.55\n",
       "3       132  女   否    是  政府工作   城市   NaN    未知  否  80.0   84.86\n",
       "4       156  女   否    是    私人   农村  42.2  从不吸烟  否  33.0   86.97\n",
       "...     ... ..  ..  ...   ...  ...   ...   ... ..   ...     ...\n",
       "1762  72836  女   否    是    私人   城市  31.1    未知  否  59.0   65.98\n",
       "1763  72861  女   否    是    私人   城市  20.1  从不吸烟  否  52.0   69.30\n",
       "1764  72882  男   否    是    私人   农村  25.0  以前吸烟  否  47.0   75.30\n",
       "1765  72911  女   是    是    私人   农村  60.9    抽烟  否  57.0  129.54\n",
       "1766  72918  女   是    是    私人   城市  30.3    未知  是  53.0   62.55\n",
       "\n",
       "[1737 rows x 11 columns]"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# (1) Take the age feature from the merged table.\n",
    "age_column = merged_data['年龄']\n",
    "\n",
    "# (2) Walk over every age with a for loop and use an if test to decide\n",
    "#     whether it is abnormal. An age counts as abnormal when it is not a\n",
    "#     whole number. float.is_integer() is also False for NaN and inf, so a\n",
    "#     missing age is flagged here instead of making int() raise.\n",
    "abnormal_rows = []\n",
    "for row_label, age in age_column.items():\n",
    "    if not float(age).is_integer():\n",
    "        abnormal_rows.append(row_label)\n",
    "\n",
    "# (3) Remove all abnormal rows with a single drop() call. drop() returns a\n",
    "#     new DataFrame, so merged_data itself is left unchanged and the\n",
    "#     original row labels of the surviving rows are kept.\n",
    "cleaned_data = merged_data.drop(index=abnormal_rows)\n",
    "cleaned_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "648c5e80",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>性别</th>\n",
       "      <th>高血压</th>\n",
       "      <th>是否结婚</th>\n",
       "      <th>工作类型</th>\n",
       "      <th>居住类型</th>\n",
       "      <th>体重指数</th>\n",
       "      <th>吸烟史</th>\n",
       "      <th>中风</th>\n",
       "      <th>年龄</th>\n",
       "      <th>平均血糖</th>\n",
       "      <th>年龄分组</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>84</td>\n",
       "      <td>男</td>\n",
       "      <td>否</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>城市</td>\n",
       "      <td>31.5</td>\n",
       "      <td>从不吸烟</td>\n",
       "      <td>否</td>\n",
       "      <td>55.0</td>\n",
       "      <td>89.17</td>\n",
       "      <td>41-60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>99</td>\n",
       "      <td>女</td>\n",
       "      <td>否</td>\n",
       "      <td>否</td>\n",
       "      <td>私人</td>\n",
       "      <td>城市</td>\n",
       "      <td>52.3</td>\n",
       "      <td>未知</td>\n",
       "      <td>否</td>\n",
       "      <td>31.0</td>\n",
       "      <td>108.89</td>\n",
       "      <td>21-40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>129</td>\n",
       "      <td>女</td>\n",
       "      <td>否</td>\n",
       "      <td>否</td>\n",
       "      <td>私人</td>\n",
       "      <td>城市</td>\n",
       "      <td>26.2</td>\n",
       "      <td>从不吸烟</td>\n",
       "      <td>否</td>\n",
       "      <td>24.0</td>\n",
       "      <td>97.55</td>\n",
       "      <td>21-40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>132</td>\n",
       "      <td>女</td>\n",
       "      <td>否</td>\n",
       "      <td>是</td>\n",
       "      <td>政府工作</td>\n",
       "      <td>城市</td>\n",
       "      <td>NaN</td>\n",
       "      <td>未知</td>\n",
       "      <td>否</td>\n",
       "      <td>80.0</td>\n",
       "      <td>84.86</td>\n",
       "      <td>61-80</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>156</td>\n",
       "      <td>女</td>\n",
       "      <td>否</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>农村</td>\n",
       "      <td>42.2</td>\n",
       "      <td>从不吸烟</td>\n",
       "      <td>否</td>\n",
       "      <td>33.0</td>\n",
       "      <td>86.97</td>\n",
       "      <td>21-40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1762</th>\n",
       "      <td>72836</td>\n",
       "      <td>女</td>\n",
       "      <td>否</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>城市</td>\n",
       "      <td>31.1</td>\n",
       "      <td>未知</td>\n",
       "      <td>否</td>\n",
       "      <td>59.0</td>\n",
       "      <td>65.98</td>\n",
       "      <td>41-60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1763</th>\n",
       "      <td>72861</td>\n",
       "      <td>女</td>\n",
       "      <td>否</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>城市</td>\n",
       "      <td>20.1</td>\n",
       "      <td>从不吸烟</td>\n",
       "      <td>否</td>\n",
       "      <td>52.0</td>\n",
       "      <td>69.30</td>\n",
       "      <td>41-60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1764</th>\n",
       "      <td>72882</td>\n",
       "      <td>男</td>\n",
       "      <td>否</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>农村</td>\n",
       "      <td>25.0</td>\n",
       "      <td>以前吸烟</td>\n",
       "      <td>否</td>\n",
       "      <td>47.0</td>\n",
       "      <td>75.30</td>\n",
       "      <td>41-60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1765</th>\n",
       "      <td>72911</td>\n",
       "      <td>女</td>\n",
       "      <td>是</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>农村</td>\n",
       "      <td>60.9</td>\n",
       "      <td>抽烟</td>\n",
       "      <td>否</td>\n",
       "      <td>57.0</td>\n",
       "      <td>129.54</td>\n",
       "      <td>41-60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1766</th>\n",
       "      <td>72918</td>\n",
       "      <td>女</td>\n",
       "      <td>是</td>\n",
       "      <td>是</td>\n",
       "      <td>私人</td>\n",
       "      <td>城市</td>\n",
       "      <td>30.3</td>\n",
       "      <td>未知</td>\n",
       "      <td>是</td>\n",
       "      <td>53.0</td>\n",
       "      <td>62.55</td>\n",
       "      <td>41-60</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1737 rows × 12 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         编号 性别 高血压 是否结婚  工作类型 居住类型  体重指数   吸烟史 中风    年龄    平均血糖   年龄分组\n",
       "0        84  男   否    是    私人   城市  31.5  从不吸烟  否  55.0   89.17  41-60\n",
       "1        99  女   否    否    私人   城市  52.3    未知  否  31.0  108.89  21-40\n",
       "2       129  女   否    否    私人   城市  26.2  从不吸烟  否  24.0   97.55  21-40\n",
       "3       132  女   否    是  政府工作   城市   NaN    未知  否  80.0   84.86  61-80\n",
       "4       156  女   否    是    私人   农村  42.2  从不吸烟  否  33.0   86.97  21-40\n",
       "...     ... ..  ..  ...   ...  ...   ...   ... ..   ...     ...    ...\n",
       "1762  72836  女   否    是    私人   城市  31.1    未知  否  59.0   65.98  41-60\n",
       "1763  72861  女   否    是    私人   城市  20.1  从不吸烟  否  52.0   69.30  41-60\n",
       "1764  72882  男   否    是    私人   农村  25.0  以前吸烟  否  47.0   75.30  41-60\n",
       "1765  72911  女   是    是    私人   农村  60.9    抽烟  否  57.0  129.54  41-60\n",
       "1766  72918  女   是    是    私人   城市  30.3    未知  是  53.0   62.55  41-60\n",
       "\n",
       "[1737 rows x 12 columns]"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# (1) Take the age feature from the cleaned table.\n",
    "age_data = cleaned_data['年龄']\n",
    "\n",
    "# (2) Discretize the age feature with equal-width binning: five bins, each\n",
    "#     20 years wide, covering 0-100.\n",
    "bins = [0, 20, 40, 60, 80, 100]\n",
    "labels = ['0-20', '21-40', '41-60', '61-80', '81-100']\n",
    "\n",
    "# pd.cut builds right-closed intervals by default, which would leave an age\n",
    "# of exactly 0 outside the first bin (NaN); include_lowest=True makes the\n",
    "# '0-20' bin include 0 as its label says.\n",
    "# assign() builds a new frame instead of writing a column into the frame\n",
    "# produced by the previous cell.\n",
    "cleaned_data = cleaned_data.assign(\n",
    "    **{'年龄分组': pd.cut(age_data, bins=bins, labels=labels, include_lowest=True)}\n",
    ")\n",
    "cleaned_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6272b43f-f127-46a8-9d6d-31761e155552",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:base] *",
   "language": "python",
   "name": "conda-base-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
